Write a short description about the course and add a link to your github repository here. This is an R markdown (.Rmd) file so you can use R markdown syntax. See the ‘Useful links’ page in the mooc area (chapter 1) for instructions.
This is my Open Data Science course book, containing all the exercise solutions and also some random projects for extra practice.
# Read the (previously wrangled) learning2014 data set from the local data folder
analysis_data <- read.csv("~/Documents/GitHub/IODS-project/data/learning2014.csv",
sep=",", header=TRUE)
# Inspect the structure: 166 observations of 7 variables (see output below)
str(analysis_data)
## 'data.frame': 166 obs. of 7 variables:
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
## $ age : int 53 55 49 53 49 38 50 37 37 42 ...
## $ attitude: int 37 31 25 35 37 38 35 29 38 21 ...
## $ deep : num 0.97 0.789 0.947 0.947 0.992 ...
## $ stra : num 0.314 0.256 0.307 0.307 0.322 ...
## $ surf : num 0.925 1.134 0.806 0.806 1.015 ...
## $ points : int 25 12 24 10 22 21 21 31 24 26 ...
dim(analysis_data)
## [1] 166 7
summary(analysis_data$gender)
## F M
## 110 56
summary(analysis_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 21.00 22.00 25.51 27.00 55.00
summary(analysis_data$attitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 26.00 32.00 31.43 37.00 50.00
summary(analysis_data$deep)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4284 0.9019 0.9921 0.9956 1.1049 1.3303
summary(analysis_data$stra)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1389 0.2923 0.3216 0.3227 0.3581 0.4312
summary(analysis_data$surf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5670 0.8655 1.0147 0.9981 1.1341 1.5519
summary(analysis_data$points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 19.00 23.00 22.72 27.75 33.00
# GGally provides ggpairs(); loading it also attaches ggplot2
library(GGally)
## Loading required package: ggplot2
library(ggplot2)
# Scatterplot matrix of all variables, coloured by gender
ggpairs(analysis_data, mapping = aes( col = gender, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Multiple linear regression: exam points explained by attitude, age and
# strategic learning (stra)
my_model <- lm(points ~ attitude + age + stra, data = analysis_data)
summary(my_model)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.206 -3.430 0.204 3.979 10.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.60773 3.38966 4.605 8.32e-06 ***
## attitude 0.35941 0.05696 6.310 2.56e-09 ***
## age -0.07716 0.05322 -1.450 0.149
## stra -6.87318 8.55576 -0.803 0.423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.307 on 162 degrees of freedom
## Multiple R-squared: 0.2043, Adjusted R-squared: 0.1896
## F-statistic: 13.87 on 3 and 162 DF, p-value: 4.305e-08
# Simpler model: drop the non-significant predictors (age, stra), keep attitude only
my_model2 <- lm(points ~ attitude, data = analysis_data)
summary(my_model2)
##
## Call:
## lm(formula = points ~ attitude, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9763 -3.2119 0.4339 4.1534 10.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.63715 1.83035 6.358 1.95e-09 ***
## attitude 0.35255 0.05674 6.214 4.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1856
## F-statistic: 38.61 on 1 and 164 DF, p-value: 4.119e-09
# 2x2 plot layout for the diagnostic plots below
par(mfrow = c(2,2))
# Residuals vs Fitted, Normal Q-Q and Residuals vs Leverage diagnostics
plot(my_model, which = c(1, 2, 5))
# Drop three influential observations (rows 2, 4 and 56) and refit the model
analysis_data_new <- analysis_data[-c(2, 4, 56), ]
my_model3 <- lm(points ~ attitude + age + stra, data = analysis_data_new)
summary(my_model3)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8988 -3.4558 0.0807 3.8415 11.5969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.30158 3.21710 4.445 1.64e-05 ***
## attitude 0.38062 0.05399 7.049 5.18e-11 ***
## age 0.04040 0.05658 0.714 0.476
## stra -13.31399 8.20912 -1.622 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.01 on 159 degrees of freedom
## Multiple R-squared: 0.2415, Adjusted R-squared: 0.2271
## F-statistic: 16.87 on 3 and 159 DF, p-value: 1.455e-09
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("stringr")
library("tidyr")
library("GGally")
library("ggplot2")
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Read the joined student alcohol consumption data set from the local data folder
analysis.data <- read.csv("~/Documents/GitHub/IODS-project/data/alc.csv",
sep=",", header=TRUE)
# Quick look at the structure: 382 observations, 35 variables (see output below)
glimpse(analysis.data)
## Observations: 382
## Variables: 35
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob <fct> teacher, other, other, services, other, other, othe...
## $ reason <fct> course, course, other, home, home, reputation, home...
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1 <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2 <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3 <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...
gather(analysis.data) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,370
## Variables: 2
## $ key <chr> "school", "school", "school", "school", "school", "schoo...
## $ value <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "G...
gather(analysis.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
library("dplyr")
# Keep only the explanatory variables chosen for the analysis plus the target
chosen.data <- analysis.data[, c("age", "sex", "Pstatus", "famrel", "high_use")]
# Draw a bar plot of chosen variables
gather(chosen.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Draw a correlation plot of chosen variables, coloured by sex
ggpairs(chosen.data, mapping = aes( col = sex, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Get cross tabulation tables for each chosen variable vs. the target
library(gmodels)
CrossTable(chosen.data$sex, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$sex | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## F | 156 | 42 | 198 |
## | 2.102 | 4.942 | |
## | 0.788 | 0.212 | 0.518 |
## | 0.582 | 0.368 | |
## | 0.408 | 0.110 | |
## ----------------|-----------|-----------|-----------|
## M | 112 | 72 | 184 |
## | 2.262 | 5.318 | |
## | 0.609 | 0.391 | 0.482 |
## | 0.418 | 0.632 | |
## | 0.293 | 0.188 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
CrossTable(chosen.data$age, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$age | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## 15 | 63 | 18 | 81 |
## | 0.671 | 1.576 | |
## | 0.778 | 0.222 | 0.212 |
## | 0.235 | 0.158 | |
## | 0.165 | 0.047 | |
## ----------------|-----------|-----------|-----------|
## 16 | 79 | 28 | 107 |
## | 0.206 | 0.484 | |
## | 0.738 | 0.262 | 0.280 |
## | 0.295 | 0.246 | |
## | 0.207 | 0.073 | |
## ----------------|-----------|-----------|-----------|
## 17 | 64 | 36 | 100 |
## | 0.540 | 1.270 | |
## | 0.640 | 0.360 | 0.262 |
## | 0.239 | 0.316 | |
## | 0.168 | 0.094 | |
## ----------------|-----------|-----------|-----------|
## 18 | 54 | 27 | 81 |
## | 0.141 | 0.331 | |
## | 0.667 | 0.333 | 0.212 |
## | 0.201 | 0.237 | |
## | 0.141 | 0.071 | |
## ----------------|-----------|-----------|-----------|
## 19 | 7 | 4 | 11 |
## | 0.067 | 0.157 | |
## | 0.636 | 0.364 | 0.029 |
## | 0.026 | 0.035 | |
## | 0.018 | 0.010 | |
## ----------------|-----------|-----------|-----------|
## 20 | 1 | 0 | 1 |
## | 0.127 | 0.298 | |
## | 1.000 | 0.000 | 0.003 |
## | 0.004 | 0.000 | |
## | 0.003 | 0.000 | |
## ----------------|-----------|-----------|-----------|
## 22 | 0 | 1 | 1 |
## | 0.702 | 1.649 | |
## | 0.000 | 1.000 | 0.003 |
## | 0.000 | 0.009 | |
## | 0.000 | 0.003 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
CrossTable(chosen.data$Pstatus, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$Pstatus | FALSE | TRUE | Row Total |
## --------------------|-----------|-----------|-----------|
## A | 26 | 12 | 38 |
## | 0.016 | 0.038 | |
## | 0.684 | 0.316 | 0.099 |
## | 0.097 | 0.105 | |
## | 0.068 | 0.031 | |
## --------------------|-----------|-----------|-----------|
## T | 242 | 102 | 344 |
## | 0.002 | 0.004 | |
## | 0.703 | 0.297 | 0.901 |
## | 0.903 | 0.895 | |
## | 0.634 | 0.267 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## --------------------|-----------|-----------|-----------|
##
##
CrossTable(chosen.data$famrel, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$famrel | FALSE | TRUE | Row Total |
## -------------------|-----------|-----------|-----------|
## 1 | 6 | 2 | 8 |
## | 0.027 | 0.063 | |
## | 0.750 | 0.250 | 0.021 |
## | 0.022 | 0.018 | |
## | 0.016 | 0.005 | |
## -------------------|-----------|-----------|-----------|
## 2 | 10 | 9 | 19 |
## | 0.832 | 1.955 | |
## | 0.526 | 0.474 | 0.050 |
## | 0.037 | 0.079 | |
## | 0.026 | 0.024 | |
## -------------------|-----------|-----------|-----------|
## 3 | 39 | 25 | 64 |
## | 0.775 | 1.823 | |
## | 0.609 | 0.391 | 0.168 |
## | 0.146 | 0.219 | |
## | 0.102 | 0.065 | |
## -------------------|-----------|-----------|-----------|
## 4 | 135 | 54 | 189 |
## | 0.044 | 0.102 | |
## | 0.714 | 0.286 | 0.495 |
## | 0.504 | 0.474 | |
## | 0.353 | 0.141 | |
## -------------------|-----------|-----------|-----------|
## 5 | 78 | 24 | 102 |
## | 0.580 | 1.362 | |
## | 0.765 | 0.235 | 0.267 |
## | 0.291 | 0.211 | |
## | 0.204 | 0.063 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## -------------------|-----------|-----------|-----------|
##
##
# Model with glm()
# Logistic regression: high alcohol use explained by parental cohabitation
# status, age, sex and quality of family relationships
chosen.mod <- glm(high_use ~ Pstatus + age + sex + famrel, data = chosen.data, family = "binomial")
summary(chosen.mod)
##
## Call:
## glm(formula = high_use ~ Pstatus + age + sex + famrel, family = "binomial",
## data = chosen.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5439 -0.8479 -0.6640 1.2254 2.0504
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.88778 1.71500 -2.267 0.0234 *
## PstatusT -0.13249 0.38409 -0.345 0.7301
## age 0.23479 0.09865 2.380 0.0173 *
## sexM 0.94515 0.23601 4.005 6.21e-05 ***
## famrel -0.32119 0.12560 -2.557 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 438.92 on 377 degrees of freedom
## AIC: 448.92
##
## Number of Fisher Scoring iterations: 4
# Compute odds ratios (OR)
# Exponentiating the logit coefficients puts them on the odds-ratio scale
chosen.mod.OR <- coef(chosen.mod) %>% exp
# Compute confidence intervals (CI) via profile likelihood
confint(chosen.mod)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -7.29460682 -0.54975516
## PstatusT -0.86780384 0.64983706
## age 0.04291218 0.43089580
## sexM 0.48770904 1.41454481
## famrel -0.56968043 -0.07549712
# Exponentiate the CIs onto the odds-ratio scale as well
chosen.mod.CI <- confint(chosen.mod) %>% exp
## Waiting for profiling to be done...
# print out the odds ratios with their confidence intervals
cbind(chosen.mod.OR, chosen.mod.CI)
## chosen.mod.OR 2.5 % 97.5 %
## (Intercept) 0.02049085 0.0006791919 0.5770911
## PstatusT 0.87590845 0.4198726450 1.9152287
## age 1.26464541 1.0438462211 1.5386352
## sexM 2.57320663 1.6285809238 4.1146131
## famrel 0.72528788 0.5657061903 0.9272824
# predict() the probability of high_use from the fitted model
probabilities <- predict(chosen.mod, type = "response")
# add the predicted probabilities to the data
chosen.data <- mutate(chosen.data, probability = probabilities)
# use the probabilities to make a prediction of high_use
# (classify as high use when the predicted probability exceeds 0.5)
chosen.data <- mutate(chosen.data, prediction = probabilities > 0.5)
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# initialize a plot of 'high_use' versus 'probability' in 'alc'
g <- ggplot(chosen.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g + geom_point()
# tabulate the target variable versus the predictions
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# Adjust the code: Use %>% to apply the prop.table() function on the output of table()
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.67801047 0.02356021
## TRUE 0.25654450 0.04188482
#Adjust the code: Use %>% to apply the addmargins() function on the output of prop.table()
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67801047 0.02356021 0.70157068
## TRUE 0.25654450 0.04188482 0.29842932
## Sum 0.93455497 0.06544503 1.00000000
# Loss function (mean prediction error): the share of observations whose
# predicted probability falls on the wrong side of 0.5 relative to the
# true class (logical class coerces to 0/1 in the subtraction).
loss_func <- function(class, prob) {
  mean(abs(class - prob) > 0.5)
}
# call loss_func to compute the average number of wrong predictions in the (training) data using the above function
loss_func(class = chosen.data$high_use, prob = chosen.data$probability)
## [1] 0.2801047
# Calculate misclassification error using an R package to confirm the above results
library(InformationValue)
# Optimal probability cutoff for classifying high_use
# NOTE(review): optimalCutoff() is given the 0/1 predictions rather than the
# fitted probabilities — confirm this is intended.
optCutOff <- optimalCutoff(chosen.data$high_use, chosen.data$prediction)[1]
misClassError(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.2801
# Calculating concordance (share of TRUE/FALSE pairs ranked correctly)
Concordance(chosen.data$high_use, chosen.data$prediction)
## $Concordance
## [1] 0.1356376
##
## $Discordance
## [1] 0.8643624
##
## $Tied
## [1] -1.110223e-16
##
## $Pairs
## [1] 30552
# Calculating sensitivity/specificity of our model
sensitivity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.1403509
specificity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.9664179
# Construct an ROC curve
library(plotROC)
# NOTE(review): plotROC is called with the binary predictions; an ROC curve is
# normally built from the predicted probabilities — confirm which was intended.
plotROC(chosen.data$high_use, chosen.data$prediction)
## K-fold cross-validation
library(boot)
# NOTE(review): the original assigned K <- nrow(chosen.data) ("leave-one-out")
# but never used it; cv.glm() below was called with K = 10, so 10-fold
# cross-validation is what actually ran. The dead assignment is removed.
cv_chosen <- cv.glm(data = chosen.data, cost = loss_func, glmfit = chosen.mod, K = 10)
## average number of wrong predictions in the cross validation
cv_chosen$delta[1]
## [1] 0.2853403
What we see above is that the average prediction error from 10-fold cross-validation of the above model is about 0.29, which is higher than the 0.26 error of the model explored in DataCamp. Hence, the model explored in DataCamp has better predictive power for alcohol use than the one explored here.
## Define big model to compare.
big.model <- glm(high_use ~ school + sex + age + address + famsize + Pstatus + G3 + failures + famrel + famsup + freetime + goout + schoolsup + absences + health + Medu + Fedu +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
## Define null model to compare.
null.model <- glm(high_use ~ 1, data = analysis.data, family = binomial(link = "logit"))
## Determining model with step procedure
# Stepwise selection in both directions, from the intercept-only model up to
# the full model, with chi-squared tests reported at each step
step.model <- step(null.model, scope = list(upper = big.model), direction = "both",
test = "Chisq", data = analysis.data)
## Start: AIC=467.68
## high_use ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + goout 1 415.68 419.68 50.001 1.537e-12 ***
## + absences 1 447.45 451.45 18.233 1.955e-05 ***
## + sex 1 450.95 454.95 14.732 0.0001239 ***
## + freetime 1 456.84 460.84 8.840 0.0029469 **
## + failures 1 456.98 460.98 8.698 0.0031864 **
## + G3 1 459.43 463.43 6.252 0.0124059 *
## + age 1 460.83 464.83 4.851 0.0276320 *
## + famrel 1 460.92 464.92 4.756 0.0292035 *
## + address 1 462.30 466.30 3.375 0.0661994 .
## + famsize 1 463.48 467.48 2.200 0.1380308
## <none> 465.68 467.68
## + health 1 464.29 468.29 1.389 0.2385700
## + activities 1 464.43 468.43 1.245 0.2645257
## + schoolsup 1 464.51 468.51 1.165 0.2803902
## + paid 1 464.80 468.80 0.877 0.3491564
## + school 1 465.13 469.13 0.553 0.4572172
## + famsup 1 465.19 469.19 0.485 0.4860956
## + Fedu 1 465.46 469.46 0.215 0.6427752
## + Pstatus 1 465.62 469.62 0.060 0.8062405
## + Medu 1 465.63 469.63 0.046 0.8298479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=419.68
## high_use ~ goout
##
## Df Deviance AIC LRT Pr(>Chi)
## + absences 1 402.50 408.50 13.175 0.0002837 ***
## + sex 1 402.74 408.74 12.935 0.0003224 ***
## + famrel 1 408.12 414.12 7.562 0.0059609 **
## + address 1 409.72 415.72 5.960 0.0146306 *
## + failures 1 410.92 416.92 4.753 0.0292404 *
## + G3 1 413.39 419.39 2.293 0.1299925
## + health 1 413.42 419.42 2.263 0.1324990
## + famsize 1 413.51 419.51 2.166 0.1411078
## <none> 415.68 419.68
## + activities 1 413.68 419.68 2.000 0.1573155
## + age 1 414.38 420.38 1.299 0.2543646
## + paid 1 414.53 420.53 1.147 0.2840893
## + freetime 1 414.75 420.75 0.931 0.3345969
## + schoolsup 1 414.77 420.77 0.910 0.3402438
## + school 1 414.79 420.79 0.886 0.3464949
## + famsup 1 415.36 421.36 0.314 0.5753984
## + Pstatus 1 415.54 421.54 0.142 0.7065896
## + Fedu 1 415.57 421.57 0.111 0.7387151
## + Medu 1 415.64 421.64 0.035 0.8522261
## - goout 1 465.68 467.68 50.001 1.537e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=408.5
## high_use ~ goout + absences
##
## Df Deviance AIC LRT Pr(>Chi)
## + sex 1 387.75 395.75 14.748 0.0001229 ***
## + famrel 1 395.79 403.79 6.716 0.0095572 **
## + address 1 397.09 405.09 5.413 0.0199833 *
## + failures 1 398.40 406.40 4.107 0.0427168 *
## + health 1 400.08 408.08 2.427 0.1192936
## + activities 1 400.24 408.24 2.262 0.1325869
## <none> 402.50 408.50
## + famsize 1 400.54 408.54 1.962 0.1613260
## + G3 1 400.92 408.92 1.583 0.2083813
## + paid 1 400.92 408.92 1.582 0.2084805
## + school 1 400.98 408.98 1.526 0.2166950
## + freetime 1 401.21 409.21 1.288 0.2563846
## + schoolsup 1 401.50 409.50 1.003 0.3164808
## + age 1 401.95 409.95 0.550 0.4581523
## + famsup 1 402.06 410.06 0.446 0.5044225
## + Medu 1 402.25 410.25 0.254 0.6144158
## + Fedu 1 402.47 410.47 0.035 0.8512142
## + Pstatus 1 402.50 410.50 0.006 0.9380858
## - absences 1 415.68 419.68 13.175 0.0002837 ***
## - goout 1 447.45 451.45 44.943 2.028e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=395.75
## high_use ~ goout + absences + sex
##
## Df Deviance AIC LRT Pr(>Chi)
## + famrel 1 379.81 389.81 7.946 0.0048195 **
## + address 1 382.70 392.70 5.058 0.0245093 *
## + activities 1 383.70 393.70 4.051 0.0441470 *
## + paid 1 384.50 394.50 3.255 0.0711919 .
## + failures 1 385.06 395.06 2.693 0.1008032
## <none> 387.75 395.75
## + school 1 385.95 395.95 1.803 0.1793763
## + G3 1 386.44 396.44 1.312 0.2520448
## + famsize 1 386.58 396.58 1.176 0.2781516
## + health 1 386.77 396.77 0.985 0.3209508
## + Medu 1 387.02 397.02 0.731 0.3926563
## + age 1 387.09 397.09 0.663 0.4156185
## + freetime 1 387.47 397.47 0.280 0.5964214
## + schoolsup 1 387.61 397.61 0.143 0.7050365
## + famsup 1 387.74 397.74 0.013 0.9095974
## + Fedu 1 387.75 397.75 0.000 0.9944014
## + Pstatus 1 387.75 397.75 0.000 0.9949057
## - sex 1 402.50 408.50 14.748 0.0001229 ***
## - absences 1 402.74 408.74 14.988 0.0001082 ***
## - goout 1 430.07 436.07 42.314 7.773e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=389.81
## high_use ~ goout + absences + sex + famrel
##
## Df Deviance AIC LRT Pr(>Chi)
## + address 1 374.76 386.76 5.044 0.0247052 *
## + activities 1 376.23 388.23 3.581 0.0584394 .
## + paid 1 376.64 388.64 3.168 0.0750934 .
## + failures 1 377.79 389.79 2.020 0.1552693
## <none> 379.81 389.81
## + health 1 377.91 389.91 1.904 0.1676831
## + school 1 378.55 390.55 1.261 0.2615349
## + G3 1 378.81 390.81 1.000 0.3172697
## + famsize 1 378.89 390.89 0.921 0.3372367
## + age 1 378.93 390.93 0.883 0.3473595
## + Medu 1 378.97 390.97 0.835 0.3607934
## + freetime 1 379.07 391.07 0.738 0.3902071
## + schoolsup 1 379.69 391.69 0.122 0.7271839
## + famsup 1 379.77 391.77 0.042 0.8367076
## + Pstatus 1 379.80 391.80 0.013 0.9084113
## + Fedu 1 379.80 391.80 0.005 0.9461156
## - famrel 1 387.75 395.75 7.946 0.0048195 **
## - absences 1 393.80 401.80 13.993 0.0001835 ***
## - sex 1 395.79 403.79 15.979 6.406e-05 ***
## - goout 1 424.86 432.86 45.048 1.923e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=386.76
## high_use ~ goout + absences + sex + famrel + address
##
## Df Deviance AIC LRT Pr(>Chi)
## + activities 1 370.67 384.67 4.094 0.0430315 *
## + paid 1 370.92 384.92 3.844 0.0499390 *
## <none> 374.76 386.76
## + health 1 373.08 387.08 1.685 0.1942380
## + failures 1 373.09 387.09 1.670 0.1962137
## + famsize 1 373.41 387.41 1.353 0.2447313
## + freetime 1 373.99 387.99 0.773 0.3793315
## + G3 1 374.37 388.37 0.392 0.5311653
## + Medu 1 374.46 388.46 0.301 0.5835271
## + age 1 374.47 388.47 0.292 0.5889660
## + school 1 374.49 388.49 0.275 0.6000857
## + schoolsup 1 374.70 388.70 0.063 0.8020037
## + famsup 1 374.73 388.73 0.032 0.8586738
## + Fedu 1 374.74 388.74 0.025 0.8739745
## + Pstatus 1 374.76 388.76 0.000 0.9993591
## - address 1 379.81 389.81 5.044 0.0247052 *
## - famrel 1 382.70 392.70 7.932 0.0048564 **
## - absences 1 388.50 398.50 13.738 0.0002101 ***
## - sex 1 390.30 400.30 15.539 8.084e-05 ***
## - goout 1 422.01 432.01 47.242 6.273e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=384.67
## high_use ~ goout + absences + sex + famrel + address + activities
##
## Df Deviance AIC LRT Pr(>Chi)
## + paid 1 366.49 382.49 4.185 0.0407904 *
## <none> 370.67 384.67
## + health 1 369.12 385.12 1.547 0.2135619
## + failures 1 369.46 385.46 1.214 0.2704852
## + famsize 1 369.53 385.53 1.143 0.2850968
## + freetime 1 369.78 385.78 0.888 0.3460988
## + age 1 370.48 386.48 0.191 0.6621013
## + Fedu 1 370.52 386.52 0.153 0.6960310
## + G3 1 370.54 386.54 0.134 0.7142250
## + school 1 370.55 386.55 0.124 0.7246465
## + Medu 1 370.56 386.56 0.108 0.7421703
## + Pstatus 1 370.61 386.61 0.063 0.8024291
## + famsup 1 370.65 386.65 0.023 0.8805015
## + schoolsup 1 370.65 386.65 0.019 0.8899358
## - activities 1 374.76 386.76 4.094 0.0430315 *
## - address 1 376.23 388.23 5.557 0.0184020 *
## - famrel 1 378.14 390.14 7.466 0.0062860 **
## - absences 1 384.72 396.72 14.051 0.0001779 ***
## - sex 1 387.95 399.95 17.275 3.234e-05 ***
## - goout 1 419.34 431.34 48.665 3.037e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.49
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid
##
## Df Deviance AIC LRT Pr(>Chi)
## + failures 1 364.29 382.29 2.198 0.1381945
## <none> 366.49 382.49
## + health 1 364.50 382.50 1.983 0.1590665
## + famsize 1 365.31 383.31 1.180 0.2772746
## + freetime 1 365.43 383.43 1.051 0.3052950
## + famsup 1 366.10 384.10 0.381 0.5368105
## + G3 1 366.11 384.11 0.372 0.5416643
## + Medu 1 366.12 384.12 0.367 0.5444822
## + age 1 366.33 384.33 0.156 0.6931483
## + school 1 366.42 384.42 0.068 0.7936917
## + Fedu 1 366.42 384.42 0.061 0.8047582
## + Pstatus 1 366.47 384.47 0.011 0.9173748
## + schoolsup 1 366.48 384.48 0.003 0.9594304
## - paid 1 370.67 384.67 4.185 0.0407904 *
## - activities 1 370.92 384.92 4.435 0.0352018 *
## - address 1 372.90 386.90 6.409 0.0113517 *
## - famrel 1 374.01 388.01 7.523 0.0060923 **
## - absences 1 381.53 395.53 15.046 0.0001049 ***
## - sex 1 386.10 400.10 19.612 9.487e-06 ***
## - goout 1 415.98 429.98 49.499 1.985e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.29
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid + failures
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 364.29 382.29
## - failures 1 366.49 382.49 2.198 0.1381945
## + health 1 362.70 382.70 1.585 0.2080158
## + famsize 1 363.01 383.01 1.278 0.2582321
## + freetime 1 363.30 383.30 0.983 0.3214915
## + Fedu 1 363.80 383.80 0.484 0.4866195
## + famsup 1 363.87 383.87 0.415 0.5195099
## - activities 1 368.10 384.10 3.817 0.0507355 .
## + school 1 364.20 384.20 0.087 0.7678936
## + Medu 1 364.24 384.24 0.049 0.8248779
## + age 1 364.25 384.25 0.034 0.8532051
## + G3 1 364.27 384.27 0.020 0.8881974
## + schoolsup 1 364.28 384.28 0.005 0.9437839
## + Pstatus 1 364.29 384.29 0.000 0.9839522
## - paid 1 369.46 385.46 5.168 0.0230019 *
## - address 1 370.26 386.26 5.975 0.0145064 *
## - famrel 1 371.11 387.11 6.826 0.0089854 **
## - absences 1 378.63 394.63 14.345 0.0001522 ***
## - sex 1 382.40 398.40 18.113 2.081e-05 ***
## - goout 1 409.81 425.81 45.518 1.512e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficients of the model selected by step()
# NOTE(review): final.model.coeff is stored but not used again below
final.model.coeff <- as.data.frame(step.model$coefficients)
# Refit the step-selected model explicitly
final.mod <- glm(high_use ~ sex + address + failures + famrel + goout + absences +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
summary(final.mod)
##
## Call:
## glm(formula = high_use ~ sex + address + failures + famrel +
## goout + absences + activities + paid, family = binomial(link = "logit"),
## data = analysis.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8829 -0.7374 -0.4707 0.6885 2.8657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.49941 0.73483 -3.401 0.000671 ***
## sexM 1.13462 0.27425 4.137 3.52e-05 ***
## addressU -0.77509 0.31595 -2.453 0.014157 *
## failures 0.32272 0.21815 1.479 0.139051
## famrel -0.37983 0.14566 -2.608 0.009119 **
## goout 0.80068 0.12880 6.217 5.08e-10 ***
## absences 0.08401 0.02218 3.788 0.000152 ***
## activitiesyes -0.51728 0.26676 -1.939 0.052487 .
## paidyes 0.61261 0.27244 2.249 0.024536 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 364.29 on 373 degrees of freedom
## AIC: 382.29
##
## Number of Fisher Scoring iterations: 5
# Get final model data: the step-selected predictors plus the target
final.data <- analysis.data[, c("goout", "absences", "sex", "famrel", "address",
"activities", "paid", "failures", "high_use")]
# predict() the probability of high_use from the final model
probabilities.final.mod <- predict(final.mod, type = "response")
# add the predicted probabilities to the data
final.data <- mutate(final.data, probability = probabilities.final.mod)
# use the probabilities to make a prediction of high_use
# (classify as high use when the predicted probability exceeds 0.5)
final.data <- mutate(final.data, prediction = probabilities.final.mod > 0.5)
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# initialize a plot of 'high_use' versus 'probability', coloured by the hard prediction
g_final <- ggplot(final.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g_final + geom_point()
# tabulate the target variable versus the predictions (raw counts)
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# Same table as proportions of all observations: pipe table() into prop.table()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.65968586 0.04188482
## TRUE 0.15445026 0.14397906
# Add row/column margins on top of the proportion table with addmargins()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.65968586 0.04188482 0.70157068
## TRUE 0.15445026 0.14397906 0.29842932
## Sum 0.81413613 0.18586387 1.00000000
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.65968586 0.04188482 0.70157068
## TRUE 0.15445026 0.14397906 0.29842932
## Sum 0.81413613 0.18586387 1.00000000
# Loss function (mean prediction error): the share of observations whose
# class label and predicted probability disagree by more than 0.5.
loss_func <- function(class, prob) {
  mean(abs(class - prob) > 0.5)
}
# call loss_func to compute the average number of wrong predictions in the (training) data
loss_func(class = final.data$high_use, prob = final.data$probability)
## [1] 0.1963351
# Confirm the misclassification error with the InformationValue package.
# NOTE(review): these functions expect predicted *probability scores*, not the
# hard TRUE/FALSE predictions the original code passed in (which also explains
# the degenerate Concordance below). The pasted outputs were produced by the
# original calls and should be regenerated.
library(InformationValue)
optCutOff_final <- optimalCutoff(final.data$high_use, final.data$probability)[1]
misClassError(final.data$high_use, final.data$probability, threshold = optCutOff_final)
## [1] 0.1963
# Concordance: proportion of (high_use, non-high_use) pairs in which the
# high_use case received the larger predicted probability
Concordance(final.data$high_use, final.data$probability)
## $Concordance
## [1] 0.4536528
##
## $Discordance
## [1] 0.5463472
##
## $Tied
## [1] 0
##
## $Pairs
## [1] 30552
# Sensitivity/specificity of the model at the optimal cutoff.
# (The original used `optCutOff`, a leftover variable from the earlier model;
# use the cutoff computed for the final model.)
sensitivity(final.data$high_use, final.data$probability, threshold = optCutOff_final)
## [1] 0.4824561
specificity(final.data$high_use, final.data$probability, threshold = optCutOff_final)
## [1] 0.9402985
# Construct an ROC curve
library(plotROC)
# ROC for the final chosen model
plotROC(final.data$high_use, final.data$probability)
# ROC for the initially chosen model
# (assumes chosen.data carries a 'probability' column like final.data -- verify)
plotROC(chosen.data$high_use, chosen.data$probability)
## K-fold cross-validation of the final model
library(boot)
# NOTE(review): K_final (= n, i.e. leave-one-out) is defined but never used;
# the cv.glm() call below actually runs 10-fold CV (K = 10). Confirm which was intended.
K_final <- nrow(final.data) #defines the leave-one-out method
cv_final <- cv.glm(data = final.data, cost = loss_func, glmfit = final.mod, K = 10)
## average number of wrong predictions in the cross validation
cv_final$delta[1]
## [1] 0.2120419
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
library("stringr")
library("tidyr")
library("GGally")
library("ggplot2")
library("MASS")
library("corrplot")
## corrplot 0.84 loaded
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the Boston housing dataset (from the MASS package) and inspect its structure
data("Boston")
glimpse(Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
# Quick plot of median home values. The original `Boston$med` relied on `$`
# partial name matching against `medv`, which is fragile -- spell the column out.
plot(Boston$medv)
# General summary of the dataset: distributions of all 14 variables
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Scatterplot matrix of all variable pairs
pairs(Boston)
# Correlation matrix, rounded to two decimals
cor_matrix <- cor(Boston) %>% round(2)
# Visualise the correlations: upper triangle only, variable labels on the diagonal
corrplot(cor_matrix, method = "circle", type = "upper",
cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# How do other variables stack up against crime rates? Do we see patterns?
# melt() reshapes to long format so each variable gets its own facet
molten <- melt(Boston, id = "crim")
ggplot(molten, aes(x = value, y = crim))+
facet_wrap( ~ variable, scales = "free")+
geom_point()
# Centering and standardizing all variables (mean 0, sd 1); scale() returns a matrix
boston_scaled <- scale(Boston)
# Summaries of the scaled variables
glimpse(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
# scale() returns a matrix, not a data frame
class(boston_scaled)
## [1] "matrix"
# Converting to data frame so columns can be manipulated with dplyr
boston_scaled <- as.data.frame(boston_scaled)
# Summary of the original (unscaled) crime rate
# (note: this summarises Boston$crim; the quantile bins below use the scaled values)
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Quantile vector of the scaled 'crim' -- used as break points for binning
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
# Categorical variable 'crime' from 'crim': one level per quartile
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, label = c("low", "med_low", "med_high", "high"))
# Tabulation of the new factor; classes are near-equal-sized by construction
table(crime)
## crime
## low med_low med_high high
## 127 126 126 127
# Removing the original continuous 'crim' from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
# Adding the new categorical variable to the scaled data
boston_scaled <- data.frame(boston_scaled, crime)
# Number of rows in the Boston dataset
n <- nrow(boston_scaled)
n
## [1] 506
# Choosing randomly 80% of the rows for the training set
# NOTE(review): no set.seed() before sample(), so this split is not reproducible
ind <- sample(n, size = n * 0.8)
ind
## [1] 504 30 39 103 475 345 180 461 8 59 92 449 132 371 200 464 412
## [18] 294 230 386 444 432 447 349 146 451 390 394 284 134 398 10 241 258
## [35] 61 402 360 490 287 271 174 80 144 248 100 244 335 88 418 458 263
## [52] 276 192 143 460 250 443 289 11 454 427 68 351 330 296 206 150 176
## [69] 246 213 37 348 159 318 204 139 439 23 35 107 212 429 185 71 264
## [86] 166 208 147 65 441 199 430 363 157 397 51 104 90 253 421 140 41
## [103] 175 16 27 501 81 314 42 265 209 489 283 405 478 356 2 79 471
## [120] 148 391 129 171 91 156 275 119 137 316 158 273 477 279 25 196 95
## [137] 242 428 221 280 48 138 67 152 110 467 4 75 179 1 240 207 126
## [154] 54 28 243 155 223 105 40 481 203 120 476 164 189 353 169 341 31
## [171] 52 145 503 409 331 473 433 238 442 272 469 216 231 286 347 380 486
## [188] 72 310 106 76 114 93 325 455 205 222 423 374 36 193 401 57 21
## [205] 358 33 270 178 131 506 491 235 392 249 440 197 366 254 149 194 74
## [222] 383 47 121 49 465 445 15 354 381 214 340 453 163 111 319 82 425
## [239] 382 66 448 484 170 32 389 109 414 251 355 177 278 165 480 98 97
## [256] 342 435 160 495 326 290 303 130 12 470 260 101 315 87 321 220 479
## [273] 369 313 247 300 64 307 9 22 407 55 20 29 415 256 232 274 502
## [290] 6 302 468 188 233 372 73 437 112 3 77 393 201 118 422 344 305
## [307] 500 417 497 154 102 413 191 395 133 304 334 219 410 63 375 343 34
## [324] 17 153 350 339 403 162 268 285 488 492 237 225 184 125 367 60 24
## [341] 295 400 136 361 376 116 297 86 456 379 62 399 128 262 261 298 493
## [358] 328 122 269 396 329 426 187 311 18 485 337 142 291 362 494 26 161
## [375] 424 322 53 255 141 498 352 151 299 46 99 317 324 123 58 115 463
## [392] 210 462 373 446 190 438 227 217 406 226 245 198 228
# Training set: the sampled 80% of rows
train <- boston_scaled[ind,]
# Test set: the remaining 20%
test <- boston_scaled[-ind,]
# Saving the correct classes from the test data before stripping the target
correct_classes <- test$crime
# Removing the 'crime' variable from the test data so prediction is honest
test <- dplyr::select(test, -crime)
# Linear discriminant analysis: crime class as target, all other columns as predictors
lda.fit <- lda(crime ~., data = train)
# Print the fitted object: priors, group means, discriminant coefficients
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2450495 0.2549505 0.2574257 0.2425743
##
## Group means:
## zn indus chas nox rm
## low 1.00241548 -0.8981354 -0.07348562 -0.8748873 0.45366022
## med_low -0.06866775 -0.2639455 -0.04298342 -0.5671943 -0.12345441
## med_high -0.38664383 0.2168326 0.25766519 0.4406764 0.06830265
## high -0.48724019 1.0171960 -0.15180559 1.0741345 -0.43451495
## age dis rad tax ptratio
## low -0.9149105 0.9306396 -0.6999747 -0.6941778 -0.4609621
## med_low -0.3104214 0.3754867 -0.5347496 -0.4493509 -0.0458316
## med_high 0.5176840 -0.3857206 -0.4330365 -0.2923154 -0.2650426
## high 0.8175049 -0.8651228 1.6373367 1.5134896 0.7798552
## black lstat medv
## low 0.37579983 -0.77710098 0.50164250
## med_low 0.31558307 -0.12474759 -0.01665684
## med_high 0.09328192 0.06999259 0.12074050
## high -0.74258543 0.93562064 -0.71375065
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.14920119 0.62067541 -0.98111940
## indus -0.01808087 -0.17327756 0.43784204
## chas -0.03412383 -0.07107150 -0.05388446
## nox 0.46795603 -0.71830741 -1.29956407
## rm 0.02709179 -0.06833023 -0.20881522
## age 0.24127786 -0.51831360 -0.11749029
## dis -0.12620548 -0.31685410 0.22974654
## rad 3.45448507 0.84328095 0.07406935
## tax -0.03856702 0.09324052 0.29774834
## ptratio 0.17278539 -0.04528434 -0.31276784
## black -0.10592436 0.02895812 0.11258836
## lstat 0.23336635 -0.23155085 0.38202877
## medv 0.12219054 -0.44903465 -0.07527684
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9517 0.0370 0.0113
# Helper: overlay biplot-style arrows for the LDA coefficients on an open plot.
# x: fitted lda object; myscale: arrow length multiplier; arrow_heads: head size;
# tex: label size; choices: which two discriminants to use as the x/y axes.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  # coefficient matrix of the fitted model; one row per predictor
  coef_mat <- coef(x)
  x_end <- myscale * coef_mat[, choices[1]]
  y_end <- myscale * coef_mat[, choices[2]]
  # one arrow per predictor, drawn from the origin
  arrows(x0 = 0, y0 = 0, x1 = x_end, y1 = y_end, col = color, length = arrow_heads)
  # label each arrow tip with its predictor name
  text(myscale * coef_mat[, choices], labels = row.names(coef_mat),
       cex = tex, col = color, pos = 3)
}
# Crime classes as integer codes, used for plot colours and symbols
classes <- as.numeric(train$crime)
# plot the LDA results in the first two discriminant dimensions
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 2)
# predict crime classes for the held-out test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the true classes against the predictions
tab <- table(correct = correct_classes, predicted = lda.pred$class)
tab
## predicted
## correct low med_low med_high high
## low 16 10 2 0
## med_low 4 13 6 0
## med_high 0 9 11 2
## high 0 0 0 29
# Row-normalised confusion matrix: the share of each true class assigned to each
# predicted class. prop.table(tab, margin = 1) replaces the original manual
# row-binding of tab[i, ]/sum(tab[i, ]) -- identical values, clearer intent,
# and the row/column names come straight from `tab`.
Predict_accuracy <- prop.table(tab, margin = 1)
Predict_accuracy
## low med_low med_high high
## low 0.5714286 0.3571429 0.07142857 0.00000000
## med_low 0.1739130 0.5652174 0.26086957 0.00000000
## med_high 0.0000000 0.4090909 0.50000000 0.09090909
## high 0.0000000 0.0000000 0.00000000 1.00000000
# caret provides confusionMatrix() with per-class statistics.
# Use library() rather than require(): require() only returns FALSE when the
# package is missing, which would let the script continue with a broken state.
library(caret)
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
# caret's signature is confusionMatrix(data = predictions, reference = truth).
# NOTE(review): the original call passed (truth, predictions), which transposes
# the matrix and swaps per-class sensitivity/specificity; the pasted output
# below was produced by the original call and should be regenerated.
confusionMatrix(data = lda.pred$class, reference = correct_classes)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low med_low med_high high
## low 16 10 2 0
## med_low 4 13 6 0
## med_high 0 9 11 2
## high 0 0 0 29
##
## Overall Statistics
##
## Accuracy : 0.6765
## 95% CI : (0.5766, 0.7658)
## No Information Rate : 0.3137
## P-Value [Acc > NIR] : 6.044e-14
##
## Kappa : 0.568
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: low Class: med_low Class: med_high Class: high
## Sensitivity 0.8000 0.4062 0.5789 0.9355
## Specificity 0.8537 0.8571 0.8675 1.0000
## Pos Pred Value 0.5714 0.5652 0.5000 1.0000
## Neg Pred Value 0.9459 0.7595 0.9000 0.9726
## Prevalence 0.1961 0.3137 0.1863 0.3039
## Detection Rate 0.1569 0.1275 0.1078 0.2843
## Detection Prevalence 0.2745 0.2255 0.2157 0.2843
## Balanced Accuracy 0.8268 0.6317 0.7232 0.9677
# Drop the class label again so distances use numeric columns only
boston_scaled <- dplyr::select(boston_scaled, -crime)
# Euclidean distance matrix
dist_eu <- dist(boston_scaled)
# Summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.3984 4.7296 4.7597 6.0150 12.5315
# Manhattan distance matrix
dist_man <- dist(boston_scaled, method = 'manhattan')
# Summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2645 8.2073 12.0993 12.8752 16.8027 43.5452
# k-means clustering with 4 centers
# NOTE(review): no set.seed() before kmeans(), so cluster assignments are not reproducible
km4 <-kmeans(boston_scaled, centers = 4)
# plot the Boston dataset coloured by cluster
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km4$cluster)
# k-means clustering with 3 centers
km3 <-kmeans(boston_scaled, centers = 3)
# plot the Boston dataset coloured by cluster
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km3$cluster)
set.seed(100)
# Compute and plot the total within-cluster sum of squares for k = 1..15
# clusters; the "elbow" in this curve suggests a good cluster count.
k.max <- 15
data <- boston_scaled
# vapply (not sapply) guarantees a numeric vector result; nstart = 50 restarts
# guards against poor random initialisations.
clust_TSS <- vapply(seq_len(k.max),
                    function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss},
                    numeric(1))
clust_TSS
## [1] 6565.000 4207.600 3519.743 3098.744 2746.623 2399.515 2143.440
## [8] 1955.816 1832.813 1736.480 1637.171 1561.280 1498.560 1464.093
## [15] 1385.043
plot(1:k.max, clust_TSS,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
# k-means clustering with 4 centers on the scaled data
km_bonus <-kmeans(boston_scaled, centers = 4)
# Linear discriminant analysis with the k-means clusters as the target.
# Index the cluster vector by the training rows' names so the target is aligned
# row-for-row with `train`: the original `rownames(mat) %in% rownames(train)`
# kept the clusters in their original order, scrambling the pairing with the
# shuffled training set.
mat <- as.matrix(km_bonus$cluster)
cluster_target <- mat[rownames(train), ]
lda.fit.bonus <- lda(cluster_target ~., data = train)
# target classes as numeric, for plot colours/symbols
classes <- as.numeric(cluster_target)
# Plot the LDA results, with arrows for the bonus model's own coefficients
# (the original drew lda.fit's arrows -- a copy-paste slip)
plot(lda.fit.bonus, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit.bonus, myscale = 2)
Run the code below for the (scaled) train data that you used to fit the LDA. The code creates a matrix product, which is a projection of the data points.
# Keep only the predictor columns used by the fitted LDA model
model_predictors <- dplyr::select(train, -crime)
# check the dimensions: predictor columns must match the rows of lda.fit$scaling
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication projects the training points onto the discriminants
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
# Interactive 3D scatter of the projection, coloured by crime class in train
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = train$crime)
# Same projection, coloured by the k-means clusters instead
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = cluster_target)
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
library("stringr")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ plotly::select() masks MASS::select(), dplyr::select()
## ✖ purrr::transpose() masks data.table::transpose()
library("GGally")
library("ggplot2")
library("MASS")
library("corrplot")
library("plotly")
library("purrr")
library("devtools")
library("ggbiplot")
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library("grid")
library("lattice")
library("FactoMineR")
# Read the 'human' dataset (human development / gender equality indicators per country)
analysis.data <- read.table("~/Documents/GitHub/IODS-project/data/humans2.txt",
header=TRUE)
glimpse(analysis.data)
## Observations: 155
## Variables: 8
## $ Life.expectancy <dbl> 81.6, 82.4, 83.0, 80.2, 81.6, 80.9, 80.9, 79...
## $ Exp.school.years <dbl> 17.5, 20.2, 15.8, 18.7, 17.9, 16.5, 18.6, 16...
## $ GNIncome <int> 64992, 42261, 56431, 44025, 45435, 43919, 39...
## $ Mat.Mort.Rate <int> 4, 6, 6, 5, 6, 7, 9, 28, 11, 8, 6, 4, 8, 4, ...
## $ Adol.Birth.Rate <dbl> 7.8, 12.1, 1.9, 5.1, 6.2, 3.8, 8.2, 31.0, 14...
## $ Rep.Parliament... <dbl> 39.6, 30.5, 28.5, 38.0, 36.9, 36.9, 19.9, 19...
## $ Edu.F2M <dbl> 1.0072389, 0.9968288, 0.9834369, 0.9886128, ...
## $ Lab.F2M <dbl> 0.8908297, 0.8189415, 0.8251001, 0.8840361, ...
# General summary of the dataset
# (the ranges differ by orders of magnitude, which motivates scaling before PCA)
summary(analysis.data)
## Life.expectancy Exp.school.years GNIncome Mat.Mort.Rate
## Min. :49.00 Min. : 5.40 Min. : 581 Min. : 1.0
## 1st Qu.:66.30 1st Qu.:11.25 1st Qu.: 4198 1st Qu.: 11.5
## Median :74.20 Median :13.50 Median : 12040 Median : 49.0
## Mean :71.65 Mean :13.18 Mean : 17628 Mean : 149.1
## 3rd Qu.:77.25 3rd Qu.:15.20 3rd Qu.: 24512 3rd Qu.: 190.0
## Max. :83.50 Max. :20.20 Max. :123124 Max. :1100.0
## Adol.Birth.Rate Rep.Parliament... Edu.F2M Lab.F2M
## Min. : 0.60 Min. : 0.00 Min. :0.1717 Min. :0.1857
## 1st Qu.: 12.65 1st Qu.:12.40 1st Qu.:0.7264 1st Qu.:0.5984
## Median : 33.60 Median :19.30 Median :0.9375 Median :0.7535
## Mean : 47.16 Mean :20.91 Mean :0.8529 Mean :0.7074
## 3rd Qu.: 71.95 3rd Qu.:27.95 3rd Qu.:0.9968 3rd Qu.:0.8535
## Max. :204.80 Max. :57.50 Max. :1.4967 Max. :1.0380
# Scatterplot matrix of the variables
pairs(analysis.data)
# Correlation matrix, rounded to two decimals
cor_matrix <- cor(analysis.data) %>% round(2)
corrplot(cor_matrix, method = "circle", type = "upper",
cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# Visualise distributions of the variables as faceted histograms
analysis.data %>% keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) + facet_wrap(~ key, scales = "free") + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# PCA on the raw (unstandardized) data.
# NOTE(review): without scaling, the huge-variance GNIncome column dominates PC1
# (see the summary further below) -- kept here as a comparison to the scaled PCA.
pca_human <- prcomp(analysis.data)
# Draw a biplot of the principal component representation and the original variables
# Using the ggbiplot package instead of the base biplot
ggbiplot(pca_human, choices=c(1,2), ellipse = TRUE, labels = rownames(analysis.data)) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
# Standardize the dataset (mean 0, sd 1) so all variables get comparable weight
human_std <- scale(analysis.data)
# PCA on the standardized data
pca_human_std <- prcomp(human_std)
# Draw the corresponding biplot for the standardized PCA
# Using the ggbiplot package instead of the base biplot
ggbiplot(pca_human_std, choices=c(1,2), ellipse = TRUE, labels = rownames(analysis.data)) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
# Print the unstandardized PCA: standard deviations and rotation (loadings)
pca_human
## Standard deviations (1, .., p=8):
## [1] 1.854416e+04 1.855219e+02 2.518701e+01 1.145441e+01 3.766241e+00
## [6] 1.565912e+00 1.912052e-01 1.591112e-01
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Life.expectancy -2.815823e-04 -0.0283150248 -1.294971e-02 6.752684e-02
## Exp.school.years -9.562910e-05 -0.0075529759 -1.427664e-02 3.313505e-02
## GNIncome -9.999832e-01 0.0057723054 5.156742e-04 -4.932889e-05
## Mat.Mort.Rate 5.655734e-03 0.9916320120 -1.260302e-01 6.100534e-03
## Adol.Birth.Rate 1.233961e-03 0.1255502723 9.918113e-01 -5.301595e-03
## Rep.Parliament... -5.526460e-05 -0.0032317269 7.398331e-03 9.971232e-01
## Edu.F2M -5.607472e-06 -0.0006713951 3.412027e-05 2.736326e-04
## Lab.F2M 2.331945e-07 0.0002819357 -5.302884e-04 4.692578e-03
## PC5 PC6 PC7 PC8
## Life.expectancy 0.9865644425 1.453515e-01 -5.380452e-03 2.281723e-03
## Exp.school.years 0.1431180282 -9.882477e-01 3.826887e-02 7.776451e-03
## GNIncome -0.0001135863 2.711698e-05 8.075191e-07 -1.176762e-06
## Mat.Mort.Rate 0.0266373214 -1.695203e-03 -1.355518e-04 8.371934e-04
## Adol.Birth.Rate 0.0188618600 -1.273198e-02 8.641234e-05 -1.707885e-04
## Rep.Parliament... -0.0716401914 2.309896e-02 2.642548e-03 2.680113e-03
## Edu.F2M -0.0022935252 -2.180183e-02 -6.998623e-01 7.139410e-01
## Lab.F2M 0.0022190154 -3.264423e-02 -7.132267e-01 -7.001533e-01
# Variance explained by each component of the unstandardized PCA
summary(pca_human)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.854e+04 185.5219 25.19 11.45 3.766 1.566 0.1912
## Proportion of Variance 9.999e-01 0.0001 0.00 0.00 0.000 0.000 0.0000
## Cumulative Proportion 9.999e-01 1.0000 1.00 1.00 1.000 1.000 1.0000
## PC8
## Standard deviation 0.1591
## Proportion of Variance 0.0000
## Cumulative Proportion 1.0000
# Compute the standard deviation of each principal component
std_dev <- pca_human$sdev
# Compute variance
pr_var <- std_dev^2
# Proportion of variance explained by each component
prop_varex <- pr_var/sum(pr_var)
# Scree plot
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
# Cumulative scree plot
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
# Plot each variable's coefficients inside a unit circle to aid PC interpretation
theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
loadings <- data.frame(pca_human$rotation,
.names = row.names(pca_human$rotation))
p + geom_text(data=loadings,
mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
coord_fixed(ratio=1) +
labs(x = "PC1", y = "PC2")
# Print the standardized PCA: standard deviations and rotation (loadings)
pca_human_std
## Standard deviations (1, .., p=8):
## [1] 2.0708380 1.1397204 0.8750485 0.7788630 0.6619563 0.5363061 0.4589994
## [8] 0.3222406
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Life.expectancy -0.44372240 0.02530473 -0.10991305 0.05834819
## Exp.school.years -0.42766720 -0.13940571 0.07340270 0.07020294
## GNIncome -0.35048295 -0.05060876 0.20168779 0.72727675
## Mat.Mort.Rate 0.43697098 -0.14508727 0.12522539 0.25170614
## Adol.Birth.Rate 0.41126010 -0.07708468 -0.01968243 -0.04986763
## Rep.Parliament... -0.08438558 -0.65136866 -0.72506309 -0.01396293
## Edu.F2M -0.35664370 -0.03796058 0.24223089 -0.62678110
## Lab.F2M 0.05457785 -0.72432726 0.58428770 -0.06199424
## PC5 PC6 PC7 PC8
## Life.expectancy -0.1628935 0.42242796 -0.43406432 -0.62737008
## Exp.school.years -0.1659678 0.38606919 0.77962966 0.05415984
## GNIncome 0.4950306 -0.11120305 -0.13711838 0.16961173
## Mat.Mort.Rate 0.1800657 -0.17370039 0.35380306 -0.72193946
## Adol.Birth.Rate 0.4672068 0.76056557 -0.06897064 0.14335186
## Rep.Parliament... 0.1523699 -0.13749772 0.00568387 0.02306476
## Edu.F2M 0.5983585 -0.17713316 0.05773644 -0.16459453
## Lab.F2M -0.2625067 0.03500707 -0.22729927 0.07304568
# Variance explained by each component of the standardized PCA
summary(pca_human_std)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0708 1.1397 0.87505 0.77886 0.66196 0.53631
## Proportion of Variance 0.5361 0.1624 0.09571 0.07583 0.05477 0.03595
## Cumulative Proportion 0.5361 0.6984 0.79413 0.86996 0.92473 0.96069
## PC7 PC8
## Standard deviation 0.45900 0.32224
## Proportion of Variance 0.02634 0.01298
## Cumulative Proportion 0.98702 1.00000
# Standard deviations of the principal components (from the fitted PCA object)
std_dev <- pca_human_std$sdev
# Variances are the squared standard deviations
pr_var <- std_dev * std_dev
# Share of the total variance explained by each component
prop_varex <- pr_var / sum(pr_var)
# Scree plot: variance share per component
plot(
  prop_varex,
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained",
  type = "b"
)
# Cumulative scree plot: running total of variance captured
plot(
  cumsum(prop_varex),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained",
  type = "b"
)
# Draw each variable's PC1/PC2 loadings inside a unit circle to help
# interpret what the first two principal components represent
angle <- seq(0, 2 * pi, length.out = 100)
unit_circle <- data.frame(x = cos(angle), y = sin(angle))
base_plot <- ggplot(unit_circle, aes(x, y)) + geom_path()
loading_df <- data.frame(
  pca_human_std$rotation,
  .names = row.names(pca_human_std$rotation)
)
base_plot +
  geom_text(
    data = loading_df,
    mapping = aes(x = PC1, y = PC2, label = .names, colour = .names)
  ) +
  coord_fixed(ratio = 1) +
  labs(x = "PC1", y = "PC2")
# Load the tea survey dataset (shipped with FactoMineR)
data(tea)
# Variables to keep for the MCA
keep <- c("Tea", "frequency", "sex", "How", "relaxing", "effect.on.health")
# all_of() errors loudly if a column is missing; one_of() is superseded
# and would only warn, silently dropping misspelled names
tea_select <- dplyr::select(tea, dplyr::all_of(keep))
# Compact column-wise overview of the subsetted data
glimpse(tea_select)
## Observations: 300
## Variables: 6
## $ Tea <fct> black, black, Earl Grey, Earl Grey, Earl Grey...
## $ frequency <fct> 1/day, 1/day, +2/day, 1/day, +2/day, 1/day, 3...
## $ sex <fct> M, F, F, M, M, M, M, F, M, M, M, M, M, M, M, ...
## $ How <fct> alone, milk, alone, alone, alone, alone, alon...
## $ relaxing <fct> No.relaxing, No.relaxing, relaxing, relaxing,...
## $ effect.on.health <fct> No.effect on health, No.effect on health, No....
# Visualise the distribution of every selected variable as faceted bar plots.
# pivot_longer() replaces the superseded gather(); coercing to character up
# front avoids the "attributes are not identical" warning that gather() emits
# when stacking factors with different level sets.
tidyr::pivot_longer(
  tea_select,
  cols = dplyr::everything(),
  names_to = "key",
  values_to = "value",
  values_transform = list(value = as.character)
) %>%
  ggplot(aes(value)) +
  facet_wrap("key", scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
What we see here is that Earl Grey is the most consumed kind of tea. The majority of respondents find tea relaxing, yet many report no effect on health. Most respondents are female, and many drink their tea alone at a high frequency of +2/day.
# Multiple Correspondence Analysis on the selected categorical tea variables
# (graph = FALSE suppresses the default plots; we plot explicitly later)
tea_mca <- MCA(tea_select, graph = FALSE)
# Summary of MCA: eigenvalues, individual/category coordinates, and eta2 per variable
summary(tea_mca)
##
## Call:
## MCA(X = tea_select, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 0.227 0.213 0.187 0.185 0.182 0.174
## % of var. 12.387 11.631 10.218 10.080 9.908 9.468
## Cumulative % of var. 12.387 24.018 34.236 44.316 54.225 63.692
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11
## Variance 0.152 0.148 0.130 0.119 0.116
## % of var. 8.267 8.098 7.092 6.513 6.338
## Cumulative % of var. 71.959 80.057 87.149 93.662 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## 1 | 0.438 0.282 0.126 | 0.224 0.078 0.033 | -0.423 0.318
## 2 | 0.328 0.158 0.056 | 0.282 0.125 0.041 | -0.767 1.046
## 3 | -0.636 0.594 0.603 | -0.342 0.183 0.174 | 0.064 0.007
## 4 | 0.211 0.065 0.048 | -0.085 0.011 0.008 | -0.086 0.013
## 5 | -0.304 0.136 0.115 | -0.019 0.001 0.000 | 0.039 0.003
## 6 | 0.211 0.065 0.048 | -0.085 0.011 0.008 | -0.086 0.013
## 7 | 0.072 0.008 0.003 | -0.197 0.061 0.021 | 0.534 0.508
## 8 | -0.178 0.046 0.013 | 0.854 1.139 0.308 | 0.068 0.008
## 9 | -0.082 0.010 0.005 | 0.362 0.205 0.098 | -0.330 0.194
## 10 | -0.444 0.289 0.162 | 0.500 0.391 0.206 | -0.172 0.053
## cos2
## 1 0.117 |
## 2 0.304 |
## 3 0.006 |
## 4 0.008 |
## 5 0.002 |
## 6 0.008 |
## 7 0.152 |
## 8 0.002 |
## 9 0.082 |
## 10 0.024 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2
## black | -0.374 2.527 0.046 -3.697 | 1.127 24.502 0.416
## Earl Grey | 0.027 0.034 0.001 0.624 | -0.312 4.895 0.176
## green | 0.681 3.740 0.057 4.138 | -0.703 4.252 0.061
## 1/day | 0.726 12.248 0.244 8.545 | -0.188 0.879 0.016
## 1 to 2/week | 0.330 1.169 0.019 2.362 | 0.811 7.541 0.113
## +2/day | -0.745 17.264 0.408 -11.044 | -0.006 0.001 0.000
## 3 to 6/week | 0.330 0.903 0.014 2.037 | -0.500 2.212 0.032
## F | -0.386 6.489 0.217 -8.063 | -0.364 6.136 0.193
## M | 0.563 9.467 0.217 8.063 | 0.531 8.953 0.193
## alone | -0.100 0.473 0.018 -2.345 | -0.294 4.389 0.160
## v.test Dim.3 ctr cos2 v.test
## black 11.154 | -0.245 1.320 0.020 -2.427 |
## Earl Grey -7.245 | 0.302 5.220 0.165 7.013 |
## green -4.275 | -1.216 14.478 0.183 -7.394 |
## 1/day -2.218 | -0.589 9.763 0.161 -6.929 |
## 1 to 2/week 5.814 | 1.250 20.375 0.268 8.958 |
## +2/day -0.093 | -0.266 2.665 0.052 -3.941 |
## 3 to 6/week -3.089 | 1.021 10.516 0.133 6.313 |
## F -7.598 | 0.027 0.037 0.001 0.557 |
## M 7.598 | -0.039 0.055 0.001 -0.557 |
## alone -6.926 | 0.158 1.436 0.046 3.714 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## Tea | 0.086 0.430 0.236 |
## frequency | 0.430 0.136 0.487 |
## sex | 0.217 0.193 0.001 |
## How | 0.201 0.310 0.262 |
## relaxing | 0.259 0.080 0.025 |
## effect.on.health | 0.169 0.130 0.113 |
# Describe each MCA dimension: lists the variables (R2) and categories
# (estimates) most strongly associated with it, sorted by p-value
dimdesc(tea_mca)
## $`Dim 1`
## $`Dim 1`$quali
## R2 p.value
## frequency 0.43035283 6.126221e-36
## relaxing 0.25926867 3.411902e-21
## sex 0.21740720 1.341552e-17
## How 0.20055142 2.561497e-14
## effect.on.health 0.16915112 1.127903e-13
## Tea 0.08585463 1.624676e-06
##
## $`Dim 1`$category
## Estimate p.value
## No.relaxing 0.25038745 3.411902e-21
## 1/day 0.26975535 7.022606e-20
## M 0.22617501 1.341552e-17
## effect on health 0.23656752 1.127903e-13
## milk 0.43694935 1.152059e-06
## green 0.27131834 2.810580e-05
## 1 to 2/week 0.08083804 1.790922e-02
## alone 0.13398295 1.876177e-02
## 3 to 6/week 0.08083749 4.143368e-02
## black -0.23108131 1.910251e-04
## other -0.81706844 4.433926e-11
## No.effect on health -0.23656752 1.127903e-13
## F -0.22617501 1.341552e-17
## relaxing -0.25038745 3.411902e-21
## +2/day -0.43143088 8.756742e-36
##
##
## $`Dim 2`
## $`Dim 2`$quali
## R2 p.value
## Tea 0.43049300 4.915996e-37
## How 0.30966049 1.171181e-23
## sex 0.19305357 1.366580e-15
## effect.on.health 0.13023599 1.171471e-10
## frequency 0.13603174 2.070879e-09
## relaxing 0.07990218 6.443703e-07
##
## $`Dim 2`$category
## Estimate p.value
## black 0.50330392 1.085763e-36
## M 0.20652085 1.366580e-15
## milk 0.09529808 1.587371e-12
## effect on health 0.20114095 1.171471e-10
## other 0.67944635 2.015273e-10
## 1 to 2/week 0.36104435 2.310087e-09
## relaxing 0.13468963 6.443703e-07
## 1/day -0.10046877 2.632170e-02
## 3 to 6/week -0.24421119 1.893538e-03
## green -0.34198306 1.479665e-05
## No.relaxing -0.13468963 6.443703e-07
## No.effect on health -0.20114095 1.171471e-10
## alone -0.39248172 5.493201e-13
## Earl Grey -0.16132086 3.485479e-14
## F -0.20652085 1.366580e-15
##
##
## $`Dim 3`
## $`Dim 3`$quali
## R2 p.value
## frequency 0.48690576 1.239113e-42
## How 0.26195167 2.130076e-19
## Tea 0.23624924 4.152609e-18
## effect.on.health 0.11251437 2.535985e-09
## relaxing 0.02536586 5.696963e-03
##
## $`Dim 3`$category
## Estimate p.value
## 1 to 2/week 0.38761770 5.303215e-22
## Earl Grey 0.29800933 2.621465e-13
## 3 to 6/week 0.28878032 6.837716e-11
## effect on health 0.17523761 2.535985e-09
## lemon 0.45654646 7.757408e-08
## alone 0.15038168 1.782129e-04
## relaxing 0.07113249 5.696963e-03
## black 0.06115032 1.499559e-02
## No.relaxing -0.07113249 5.696963e-03
## other -0.34255089 2.707970e-03
## +2/day -0.26836999 6.790217e-05
## No.effect on health -0.17523761 2.535985e-09
## 1/day -0.40802803 5.332058e-13
## milk -0.26437725 8.963243e-14
## green -0.35915965 9.118354e-15
# Plot the MCA biplot of variable categories only: hide individuals ("ind")
# and colour the category labels by their source variable (habillage = "quali")
plot(tea_mca, invisible=c("ind"), habillage = "quali")